In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib_venn import venn2, venn3
import os, re, json
from utility_functions import *
# Echo the date stamp that prefixes every output file name written below.
# NOTE(review): DATE is not defined in this notebook; presumably it comes from
# the star import of utility_functions — confirm.
DATE
Out[1]:
'20250627'
In [2]:
# Root folder that holds one sub-folder per dataset. The original location
# stays the default; set the IONBOT_WORKING_FOLDER environment variable to run
# the notebook on another machine without editing this cell.
working_folder = os.environ.get(
    "IONBOT_WORKING_FOLDER",
    "C:/Users/Enrico/OneDrive - UGent/run-ionbot",
)

# Names of the dataset sub-folders to load from `working_folder`.
PXDs = [
    "PXD002057.v0.11.4",
    "PXD005833.v0.11.4",
    "PXD014258.v0.11.4",
]

# Filtering mode; it is embedded in the input CSV name and in every output
# file name. Other values previously used here: 'global', 'custom'.
filtering = 'hybrid'
In [3]:
# Read the outer-joined OpenProt x TrEMBL table of every dataset listed in PXDs.
# The per-dataset frames get their own name so that `data` is only ever the
# combined DataFrame, and the loop variable is no longer `_`, which IPython
# reserves for the last cell output.
csv_name = f"openprot-x-trembl-filt-{filtering}-outerjoin.csv"
frames = [
    pd.read_csv(os.path.join(working_folder, dataset_name, csv_name))
    for dataset_name in PXDs
]

# Report the size of each dataset before stacking them.
for frame in frames:
    print(frame.shape)

# Stack all datasets into one frame with a fresh, continuous index.
data = pd.concat(frames, ignore_index=True)
print(data.shape)
data.tail()
(47930, 66) (163741, 66) (131280, 66) (342951, 66)
Out[3]:
| spectrum_title | scan | spectrum_file | precursor_mass_trembl | database_peptide_trembl | matched_peptide_trembl | modifications_trembl | database_trembl | psm_score_trembl | global_q_trembl | ... | all-explained_open | by-intensity-pattern-correlation_open | top_tag_rank_nterm_open | top_tag_rank_cterm_open | top_tag_rank_open | predicted_retention_time_open | retention_time_error_adjusted_open | Same_peptide | Same_mod_peptide | Same_mods_noRT | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 342946 | ESC-HF-SampleHela5:controllerType=0 controller... | 9994 | ESC-HF-SampleHela5 | 1205.651464 | HLSVNDLPVGR | HLSVNDLPVGR | Unmodified | T | 2.325810 | 0.000057 | ... | 0.2528 | 0.7855 | 0.0 | 1.0 | 0.0 | 1023.251153 | 100.694167 | True | True | True |
| 342947 | ESC-HF-SampleHela5:controllerType=0 controller... | 9996 | ESC-HF-SampleHela5 | 1400.780981 | TFIAIKPDGVQR | TFIAIKPDGVQR | 6|[1263]Gly[K](144.07)_or_6|[4]Carbamidomethyl... | T | 0.764866 | 0.001069 | ... | 0.2335 | 0.7031 | 0.0 | 17.0 | 0.0 | 1274.538023 | 150.482963 | True | False | False |
| 342948 | ESC-HF-SampleHela5:controllerType=0 controller... | 9997 | ESC-HF-SampleHela5 | 1324.626854 | KFEEIPIAHIK | KFEEIPIAHIK | Unmodified | T | 1.441000 | 0.000057 | ... | 0.1061 | 0.8813 | 3.0 | 4.0 | 3.0 | 1105.713169 | 14.384891 | True | True | True |
| 342949 | ESC-HF-SampleHela5:controllerType=0 controller... | 9998 | ESC-HF-SampleHela5 | 1436.764379 | GVTFNVTTVDTKR | GVTFNVTTVDTKR | Unmodified | T | 1.822180 | 0.000057 | ... | 0.2252 | 0.6929 | 0.0 | 0.0 | 0.0 | 1159.870875 | 35.633775 | True | True | True |
| 342950 | ESC-HF-SampleHela5:controllerType=0 controller... | 9999 | ESC-HF-SampleHela5 | 1334.689550 | GNEIEPNFSATR | GNEIEPNFSATR | Unmodified | T | 1.931300 | 0.000057 | ... | 0.1597 | 0.7231 | 0.0 | 2.0 | 0.0 | 1285.778264 | 161.457404 | True | True | True |
5 rows × 66 columns
General plot¶
In [4]:
# Overall overlap between the two searches: a spectrum is taken as identified
# by a search when that search's `database_*` column is not missing.
has_trembl_hit = data.database_trembl.notna()
has_open_hit = data.database_open.notna()
A = data.loc[has_trembl_hit, 'spectrum_title']
B = data.loc[has_open_hit, 'spectrum_title']

# Two-set Venn diagram of the distinct spectrum titles of each search.
venn2(
    [set(A), set(B)],
    set_labels=['TrEMBL', 'OpenProt'],
    set_colors=[project_palette['trembl'], project_palette['openprot']],
)
plt.title('Identified spectra overlap (all datasets)')
plt.savefig(f"publication-data/{DATE}-overall-overlap-trembl-openprot-{filtering}-filtering.svg")
In [5]:
# Ratio of distinct spectrum titles: OpenProt search (B) over TrEMBL search (A).
n_titles_open = len(set(B))
n_titles_trembl = len(set(A))
n_titles_open / n_titles_trembl
Out[5]:
0.9405282625498633
In [6]:
# Build the Sankey figure and its counts table with the project helper;
# `counts` is sliced by label in the next cell.
sankey_svg = f"publication-data/{DATE}-Sankey-trembl-openprot-{filtering}-filtering.svg"
F, counts = make_sankey_plot_with_counts(data, suffixes=['_trembl', '_open'])

# Export the figure for publication, then render it inline.
F.write_image(sankey_svg)
F.show()
In [7]:
# Row labels to keep, in display order.
row_classes = [
    'Canonical+Unmodified/Expected',
    'Canonical+Unexpected',
    'Decoy',
    'Unidentified',
]
# Column labels to keep, in display order.
column_classes = [
    'Canonical+Unmodified/Expected',
    'Canonical+Unexpected',
    'NonCanonical+Unmodified/Expected',
    'NonCanonical+Unexpected',
    'Decoy',
    'Unidentified',
]

# Label-based selection fixes the row/column order that the positional
# `.iloc` slices in the following cells rely on.
data3 = counts.loc[row_classes, column_classes]
data3.style.background_gradient()
Out[7]:
| sankey_open | Canonical+Unmodified/Expected | Canonical+Unexpected | NonCanonical+Unmodified/Expected | NonCanonical+Unexpected | Decoy | Unidentified |
|---|---|---|---|---|---|---|
| sankey_trembl | ||||||
| Canonical+Unmodified/Expected | 226987 | 1207 | 677 | 728 | 259 | 15073 |
| Canonical+Unexpected | 1281 | 78567 | 476 | 866 | 73 | 7862 |
| Decoy | 199 | 73 | 21 | 20 | 879 | 2425 |
| Unidentified | 1796 | 1165 | 679 | 865 | 773 | 0 |
In [8]:
# All spectra: the full table, used as the 100% reference.
tmp = data3.iloc[:, :]
n_subset = tmp.sum().sum()
n_total = data3.sum().sum()
print(n_subset)
print(f"{n_subset / n_total:.1%}")
tmp
342951 100.0%
Out[8]:
| sankey_open | Canonical+Unmodified/Expected | Canonical+Unexpected | NonCanonical+Unmodified/Expected | NonCanonical+Unexpected | Decoy | Unidentified |
|---|---|---|---|---|---|---|
| sankey_trembl | ||||||
| Canonical+Unmodified/Expected | 226987 | 1207 | 677 | 728 | 259 | 15073 |
| Canonical+Unexpected | 1281 | 78567 | 476 | 866 | 73 | 7862 |
| Decoy | 199 | 73 | 21 | 20 | 879 | 2425 |
| Unidentified | 1796 | 1165 | 679 | 865 | 773 | 0 |
In [9]:
# Canonical --> Canonical: first two rows and first two columns of data3.
tmp = data3.iloc[0:2, 0:2]
n_subset = tmp.sum().sum()
n_total = data3.sum().sum()
print(n_subset)
print(f"{n_subset / n_total:.1%}")
tmp
308042 89.8%
Out[9]:
| sankey_open | Canonical+Unmodified/Expected | Canonical+Unexpected |
|---|---|---|
| sankey_trembl | ||
| Canonical+Unmodified/Expected | 226987 | 1207 |
| Canonical+Unexpected | 1281 | 78567 |
In [10]:
# Canonical --> Unidentified: first two rows, last column of data3.
tmp = data3.iloc[0:2, -1:]
n_subset = tmp.sum().sum()
n_total = data3.sum().sum()
print(n_subset)
print(f"{n_subset / n_total:.1%}")
tmp
22935 6.7%
Out[10]:
| sankey_open | Unidentified |
|---|---|
| sankey_trembl | |
| Canonical+Unmodified/Expected | 15073 |
| Canonical+Unexpected | 7862 |
In [11]:
# Canonical --> NonCanonical: first two rows, third and fourth columns of data3.
tmp = data3.iloc[0:2, 2:4]
n_subset = tmp.sum().sum()
n_total = data3.sum().sum()
print(n_subset)
print(f"{n_subset / n_total:.1%}")
tmp
2747 0.8%
Out[11]:
| sankey_open | NonCanonical+Unmodified/Expected | NonCanonical+Unexpected |
|---|---|---|
| sankey_trembl | ||
| Canonical+Unmodified/Expected | 677 | 728 |
| Canonical+Unexpected | 476 | 866 |
In [12]:
# Any Peptide --> Any Peptide: everything except the last row and the last
# column (the 'Unidentified' labels in data3).
tmp = data3.iloc[:-1, :-1]
n_subset = tmp.sum().sum()
n_total = data3.sum().sum()
print(n_subset)
print(f"{n_subset / n_total:.1%}")
tmp
312313 91.1%
Out[12]:
| sankey_open | Canonical+Unmodified/Expected | Canonical+Unexpected | NonCanonical+Unmodified/Expected | NonCanonical+Unexpected | Decoy |
|---|---|---|---|---|---|
| sankey_trembl | |||||
| Canonical+Unmodified/Expected | 226987 | 1207 | 677 | 728 | 259 |
| Canonical+Unexpected | 1281 | 78567 | 476 | 866 | 73 |
| Decoy | 199 | 73 | 21 | 20 | 879 |
In [13]:
# Unidentified --> Canonical: fourth row only (kept as a one-row frame),
# first two columns of data3.
tmp = data3.iloc[3:4, 0:2]
n_subset = tmp.sum().sum()
n_total = data3.sum().sum()
print(n_subset)
print(f"{n_subset / n_total:.1%}")
tmp
2961 0.9%
Out[13]:
| sankey_open | Canonical+Unmodified/Expected | Canonical+Unexpected |
|---|---|---|
| sankey_trembl | ||
| Unidentified | 1796 | 1165 |
In [14]:
# Any --> NonCanonical: all rows, third and fourth columns of data3.
tmp = data3.iloc[:, 2:4]
n_subset = tmp.sum().sum()
n_total = data3.sum().sum()
print(n_subset)
print(f"{n_subset / n_total:.1%}")
tmp
4332 1.3%
Out[14]:
| sankey_open | NonCanonical+Unmodified/Expected | NonCanonical+Unexpected |
|---|---|---|
| sankey_trembl | ||
| Canonical+Unmodified/Expected | 677 | 728 |
| Canonical+Unexpected | 476 | 866 |
| Decoy | 21 | 20 |
| Unidentified | 679 | 865 |
Zoom on non-canonical identifications¶
In [15]:
# Restrict the Sankey to rows whose OpenProt hit is labelled 'NonCanonical'
# and whose database_open value is 'T'.
# NOTE(review): 'T' presumably marks target (non-decoy) hits — confirm.
is_noncanonical_open = data.isCanonical_open == 'NonCanonical'
is_flagged_t_open = data.database_open == 'T'
zoom_svg = f"publication-data/{DATE}-Zoomed-Sankey-trembl-openprot-{filtering}-filtering.svg"

# The counts table returned alongside the figure is not needed here.
F, _ = make_sankey_plot_with_counts(
    data[is_noncanonical_open & is_flagged_t_open],
    suffixes=['_trembl', '_open'],
)
F.write_image(zoom_svg)
F.show()
In [ ]:
# Call the project's autosave helper, passing the filtering mode as an extra label.
# NOTE(review): autosave is not defined in this notebook; what it writes and
# where cannot be told from here — confirm in utility_functions.
label_suffix = '-' + filtering
autosave(extra_labels=label_suffix)

# Show which filtering mode this run used.
filtering